# @Author  : CHEN ZHI LING
# @Time    : 2021/7/9
# @Description


import logging
import re
import MeCab

'''
加载停用词列表
'''


def get_stopwords(path="stopWord.txt"):
    """Load the stop-word list from *path*, one word per line.

    Args:
        path: UTF-8 text file containing one stop word per line.
              Defaults to "stopWord.txt" in the working directory,
              matching the original hard-coded behaviour.

    Returns:
        set[str]: the stop words with trailing newlines removed.
    """
    # NOTE(review): the original called logging.basicConfig() here, but
    # nothing in this module logs; global logging configuration belongs
    # at the program entry point, not in a data-loading helper.
    stopwords_set = set()
    with open(path, "r", encoding="utf-8") as stopwords_file:
        for stop_word in stopwords_file:
            stopwords_set.add(stop_word.strip("\n"))
    return stopwords_set


'''
使用正则表达式解析文本
'''


def parse_zhwiki(read_file_path, save_file_path):
    """Tokenize a WikiExtractor dump with MeCab and append the cleaned
    text to *save_file_path*, one space-separated line per input line.

    <doc ...> / </doc> wrapper tags are skipped, tokens present in the
    stop-word list are dropped, and blank results are not written.

    Args:
        read_file_path: UTF-8 input file produced by WikiExtractor.
        save_file_path: UTF-8 output corpus file (opened in append mode).
    """
    # WikiExtractor wraps each article in <doc ...> ... </doc> markers;
    # these lines carry no article text and must be filtered out.
    # (The original built a broken, never-used character-class regex.)
    doc_tag = re.compile(r"^</?doc.*>$")
    mecab = MeCab.Tagger('-Owakati')
    stopwords = get_stopwords()
    # Context managers guarantee both handles close even on error;
    # the original leaked the input handle entirely.
    with open(read_file_path, "r", encoding="utf-8") as source, \
            open(save_file_path, "a+", encoding="utf-8") as output:
        for line in source:
            line = line.strip("\n")
            if not line or doc_tag.match(line):
                continue
            # -Owakati mode returns ONE space-separated string; split()
            # yields the tokens. (The original iterated the string and
            # therefore filtered individual characters, not words.)
            tokens = [w for w in mecab.parse(line).split()
                      if w not in stopwords]
            if tokens:
                # Join with spaces and end each article line with a
                # newline so the corpus is one document per line.
                output.write(" ".join(tokens) + "\n")


if __name__ == "__main__":
    # Guard the driver so importing this module for get_stopwords()
    # does not trigger the (slow, path-specific) corpus build.
    input_path = 'D:/word2vec数据/cleandata1/BB0000.txt'
    output_path = 'D:/word2vec数据/cleandata1/corpus1.txt'
    parse_zhwiki(input_path, output_path)
